Linear regression I

Published

September 2, 2024

Objectives

Setup

Code
# Load every required package; stopifnot() aborts rendering as soon as
# one of them is missing (require() returns FALSE instead of erroring).
stopifnot(
  require(broom),
  require(corrr),
  require(DT),
  require(GGally),
  require(ggforce),
  require(glue),
  require(gt),
  require(httr),
  require(kableExtra),
  require(lobstr),
  require(magrittr),
  require(patchwork),
  require(rlang),
  require(skimr),
  require(fs),
  require(tidyverse),
  require(viridis)
)

# Switch to a minimal ggplot2 theme, keeping the previous one around.
old_theme <- theme_set(theme_minimal())

# Global knitr chunk options for this document.
knitr::opts_chunk$set(
  message = FALSE,
  warning = FALSE,
  comment = NA,
  prompt = FALSE,
  cache = FALSE,
  echo = TRUE,
  results = "asis"
)

Dataset

Dataset Banque.csv contains information on clerical officers in the Banking sector. We aim to investigate connections between variables.

Per-column analysis

Code
# Locate the data directory: prefer ./DATA when rendering from the project
# root, otherwise fall back to ../DATA (rendering from a subdirectory).
# Fix: the original tested the *relative* path 'DATA' but then assigned the
# *absolute* path '/DATA', so the chosen path never matched the test.
# Base dir.exists() is equivalent to fs::dir_exists() here.
if (dir.exists("DATA")) {
  datapath <- "DATA"
} else {
  datapath <- "../DATA"
}
Code
# Build the path to the dataset.
fpath <- str_c(datapath, "Banque.csv", sep="/")  # tune this

# Tab-separated file; force the categorical-looking numeric codes to be
# read as character so they are not silently treated as quantities.
bank <- readr::read_delim(fpath, delim = "\t", 
    escape_double = FALSE, 
    col_types = cols(
        SEXE = col_character(), 
        CATEGORIE = col_character(), 
        NB_ETUDES = col_character()), 
    trim_ws = TRUE)

# View(bank)

# Quick look at the columns. Fix: the original passed `withd=50`, a typo
# that glimpse() silently swallows through `...`; `width` is intended.
bank %>%
  glimpse(width=50)
Rows: 474
Columns: 14
$ SEXE            <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",…
$ AGE             <dbl> 64, 55, 56, 60, 62, 61, 62, 63, 59, 61, 52, 57, 55, 52…
$ CATEGORIE       <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",…
$ NB_ETUDES       <chr> "8", "8", "8", "12", "12", "12", "12", "8", "8", "12",…
$ EXPERIENCE      <dbl> 275, 43, 0, 0, 180, 163, 288, 412, 76, 124, 271, 319, …
$ ANCIENNETE      <dbl> 70, 74, 92, 82, 68, 66, 84, 88, 76, 97, 72, 72, 85, 81…
$ SAL_EMBAUCHE    <dbl> 10200, 10200, 9750, 10200, 10200, 10200, 10200, 9750, …
$ SAL_ACTUEL      <dbl> 15750, 15900, 16200, 16200, 16200, 16350, 16500, 16650…
$ VILLE           <chr> "PARIS", "PARIS", "LILLE", "BORDEAUX", "PARIS", "PARIS…
$ SATIS_EMPLOI    <chr> "non", "non", "oui", "non", "non", "non", "oui", "non"…
$ SATIS_CHEF      <chr> "oui", "oui", "non", "non", "oui", "oui", "non", "non"…
$ SATIS_SALAIRE   <chr> "non", "non", "non", "non", "oui", "non", "non", "non"…
$ SATIS_COLLEGUES <chr> "non", "non", "oui", "oui", "oui", "non", "non", "oui"…
$ SATIS_CE        <chr> "oui", "oui", "oui", "oui", "oui", "oui", "oui", "oui"…
The table schema is the following

-   SEXE :
    -   "0" : Man,
    -   "1" : Woman.
-   AGE : in years.
-   CATEGORIE : Employment category (from 1 to 7).
-   NB_ETUDES : Number of years of education
-   EXPERIENCE : Previous work experience (in months).
-   ANCIENNETE : Seniority in this bank (in months).
-   SAL_EMBAUCHE : Starting salary (Euros).
-   SAL_ACTUEL : Present salary (Euros).
-   VILLE : City of residence
-   SATIS_EMPLOI : Satisfied with your job?
-   SATIS_CHEF : Satisfied with your manager?
-   SATIS_SALAIRE : Satisfied with your salary?
-   SATIS_COLLEGUES : Satisfied with your colleagues?
-   SATIS_CE : Happy with your works council?
- Population: Bank employees in France
- Sample: Those employees who answered the questionnaire (we have no clues about possible selection bias)
Code
# Faceted overview plot: bar charts for factor columns, density-scaled
# histograms for numeric ones.
#
# df : a data frame.
# .f : a column predicate (is.factor or is.numeric) selecting which
#      columns to display.
make_biotifoul <- function(df, .f = is.factor){
  # `ifelse()` on a scalar condition is an anti-pattern; plain if/else
  # is clearer and cannot strip attributes.
  .scales <- if (identical(.f, is.factor)) "free_x" else "free"

  p <- df %>%
    select(where(.f)) %>%
    pivot_longer(
      cols = everything(),
      names_to = "var",
      values_to = "val"
    ) %>%
    ggplot() +
    aes(x = val) +
    facet_wrap(~var, scales = .scales) + xlab("")

  if (identical(.f, is.factor)) {
    p + geom_bar()
  } else {
    # xlab("") was already set above; the duplicate call was a no-op.
    p + geom_histogram(aes(y = after_stat(density)), bins = 30)
  }
}

Preliminary inspection.

Code
# Overview of every column (type, missingness, summary statistics),
# rendered as an interactive table.
DT::datatable(skim(bank), extensions = c("Responsive"))

All columns with at most 10 distinct values will be considered as factors.

Code
# Names of the columns with at most 10 distinct values: candidates for
# conversion to factors.
distinct_counts <- bank %>%
  summarise(across(everything(), n_distinct))

to_factorize <- distinct_counts %>%
  pivot_longer(
    cols = everything(),
    names_to = "name",
    values_to = "n_distinct"
  ) %>%
  filter(n_distinct <= 10) %>%
  pull(name)
Code
# Satisfaction questions are "oui"/"non": turn them into logicals.
# Fix: `ifelse(. == "oui", TRUE, FALSE)` is redundant -- the comparison
# already yields the logical vector (and NA stays NA in both versions).
bank <- bank %>% 
  mutate(
    across(starts_with('SATIS_'),    # tidy selection 
           ~ . == "oui")
  )

# Recode the sex indicator ("1" = woman, "0" = man) to readable labels.
bank <- bank %>% 
  mutate(SEXE = ifelse(SEXE == 1, "F", "M")) 

# Recompute the factor candidates, this time excluding the freshly
# created logical columns.
to_factorize <- bank %>% 
  select(-where(is_logical)) %>%   # tidy selection 
  summarise(across(everything(), n_distinct)) %>%  # tidy selection 
  pivot_longer(everything(),
               names_to = "col_name", 
               values_to = "n_distinct") %>% 
  filter(n_distinct <= 10) %>% 
  pull(col_name)

bank <- bank %>% 
  mutate(across(all_of(to_factorize), as_factor))  # tidy selection 
Code
# SEXE was already recoded to "M"/"F" in the previous chunk, so blindly
# recoding "0"/"1" triggers the "Unknown levels" warning visible in the
# rendered output. Only apply the recode when the raw codes are present.
if (any(c("0", "1") %in% levels(bank$SEXE))) {
  bank <- bank %>%
    mutate(SEXE = fct_recode(SEXE, "M" = "0", "F" = "1"))
}
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `SEXE = fct_recode(SEXE, M = "0", F = "1")`.
Caused by warning:
! Unknown levels in `f`: 0, 1

It looks better!

Code
# Summary of the numeric columns as an interactive table.
DT::datatable(skim(bank, where(is.numeric)), extensions = c("Responsive"))
Code
# Summary of the factor columns as an interactive table.
DT::datatable(skim(bank, where(is.factor)), extensions = c("Responsive"))
Code
# Summary of the logical (satisfaction) columns as an interactive table.
DT::datatable(skim(bank, where(is.logical)), extensions = c("Responsive"))

We add an identifier column so as to identify rows

Code
# Add a character identifier column named "id" (built from row names).
bank <- rownames_to_column(bank, var = "id")
Code
# Bar charts of every factor column (the id column is dropped first).
bank %>%
  select(!id) %>%
  make_biotifoul(.f = is.factor)

Code
# Histograms of every numeric column (the id column is dropped first).
bank %>%
  select(!id) %>%
  make_biotifoul(.f = is.numeric)

Pairwise scan

Use pairs() or ggpairs() to scan pairwise interactions between columns

ggpairs() explores all pairwise interactions. It is time-consuming.

Code
# bank %>%
#   ggpairs()

Function pairs works with numerical columns

Code
# Base-R scatterplot matrix of the numeric columns.
pairs(select(bank, where(is.numeric)))

Pairwise interactions between numerical columns of bank dataset

As we intend to explain SAL_ACTUEL as a function of the other variables, the last row is interesting. SAL_EMBAUCHE looks more promising than the three other covariates.

Code
# Plot SAL_ACTUEL against each other numeric column, one facet per
# covariate (x scales are free since units differ).
bank %>%
  select(where(is.numeric)) %>%
  pivot_longer(
    cols = -SAL_ACTUEL,
    names_to = "covariate",
    values_to = "X"
  ) %>%
  ggplot(aes(x = X, y = SAL_ACTUEL)) +
  geom_point(alpha = .5, size = .5) +
  facet_wrap(~ covariate, scales = "free_x") +
  xlab("") +
  labs(
    title = "SAL_ACTUEL versus other numerical covariates",
    subtitle = "Banque dataset"
  )

Pairwise interactions between numerical covariates and response variable SAL_ACTUEL

Linear correlation coefficient

We first investigate the connection between salary at hiring time (SAL_EMBAUCHE) and current salary (SAL_ACTUEL).

Code
# Base scatterplot of current salary versus starting salary.
p_scat <- ggplot(bank, aes(x = SAL_EMBAUCHE, y = SAL_ACTUEL)) +
  geom_point(alpha = .5, size = .5) +
  ggtitle("Bank dataset")

With the correlation coefficient.

Code
# Pearson correlation between current and starting salary.
rho <- cor(bank$SAL_ACTUEL, bank$SAL_EMBAUCHE)

# Same scatterplot with an OLS line and the correlation annotated.
# Fix: `se = FALSE` spelled out -- T/F are reassignable and discouraged.
p_scat_reg_lin <- p_scat +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  annotate(geom = "text",
           x = 60000, y = 80000,
           label = glue("rho = {round(rho, 2)}"))

# Side by side (patchwork), with a common y range.
(p_scat + ylim(c(0, 160000))) + (p_scat_reg_lin + ylim(c(0, 160000)))

Code
rho <- cor(bank$SAL_ACTUEL, bank$SAL_EMBAUCHE)

# Simple linear regression of current salary on starting salary.
lm_1 <- lm(SAL_ACTUEL ~ SAL_EMBAUCHE, data = bank)

# Scatterplot with three overlays: the lm smoother (blue), a loess
# smoother (red), and the fitted line redrawn from the coefficients
# (green) -- the green and blue lines should coincide.
# Fixes: `se = FALSE` spelled out; `coef()` is the canonical accessor.
p_scat_1 <- bank %>% 
  ggplot() +
  aes(x = SAL_EMBAUCHE, y = SAL_ACTUEL) +
  geom_point(alpha = .5, size = .5) +
  annotate(geom = "text", x = 60000, y = 60000,
           label = str_c("rho: ", round(rho, 2))) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  geom_smooth(method = "loess", formula = y ~ x, se = FALSE, color = "red") +
  geom_abline(slope = coef(lm_1)[2], 
              intercept = coef(lm_1)[1], color = "green") # +
#  coord_fixed()

p_scat_1 +
  xlim(c(-1000, 1e5)) + ylim(c(-1000, 2e5))

Code
# Zoom on the lower-left region while keeping the full plot in view.
p_scat_1 +
  ggforce::facet_zoom(xlim = c(-1000, 25000), ylim = c(-1000, 50000))

Linear fit using ordinary least squares (OLS)

Code
# Fit the simple regression and keep its formula as a string for use in
# table and plot captions.
lm_1 <- lm(formula = SAL_ACTUEL ~ SAL_EMBAUCHE, data = bank)

# Turn a fitted model's formula into a single string.
# Fix: `deparse()` may split a long formula into several strings;
# `deparse1()` (R >= 4.0) is guaranteed to return exactly one.
# A plain function replaces the magrittr functional sequence; it is
# called the same way by all existing code.
lm2str_frm <- function(model) {
  deparse1(formula(model))
}

frm_1 <- lm2str_frm(lm_1)

summary(lm_1)

Call: lm(formula = SAL_ACTUEL ~ SAL_EMBAUCHE, data = bank)

Residuals: Min 1Q Median 3Q Max -35424 -4031 -1154 2584 49293

Coefficients: Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.928e+03 8.887e+02 2.17 0.0305 *
SAL_EMBAUCHE 1.909e+00 4.741e-02 40.28 <2e-16 ***


Signif. codes: 0 ‘’ 0.001 ’’ 0.01 ’’ 0.05 ‘.’ 0.1 ’ ’ 1

Residual standard error: 8115 on 472 degrees of freedom Multiple R-squared: 0.7746, Adjusted R-squared: 0.7741 F-statistic: 1622 on 1 and 472 DF, p-value: < 2.2e-16

Code
# R^2 equals the squared correlation between fitted and observed values.
cor(bank$SAL_ACTUEL, lm_1$fitted.values)^2

[1] 0.7746068

Code
# Equivalently, R^2 is the share of response variance captured by the fit.
var(fitted(lm_1)) / var(bank$SAL_ACTUEL)

[1] 0.7746068

Code
# Coefficient table. Fix: the argument is `digits`; the original `digit=`
# only worked through partial argument matching.
lm_1 %>%
  tidy() %>%
  knitr::kable(digits = 2, caption = frm_1)
SAL_ACTUEL ~ SAL_EMBAUCHE
term estimate std.error statistic p.value
(Intercept) 1928.21 888.68 2.17 0.03
SAL_EMBAUCHE 1.91 0.05 40.28 0.00
Code
# One-row model summary (R^2, F statistic, information criteria, ...).
# Fix: `digits` spelled out instead of relying on partial matching.
lm_1 %>%
  glance() %>%
  knitr::kable(digits = 2, caption = frm_1)
SAL_ACTUEL ~ SAL_EMBAUCHE
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
0.77 0.77 8115.36 1622.12 0 1 -4938.29 9882.58 9895.07 31085446686 472 474
Code
# Augment the data with per-observation diagnostics (.fitted, .resid,
# .hat, .cooksd, ...).
lm_1_aug <- augment(lm_1, data = bank)

knitr::kable(head(lm_1_aug), digits = 2, caption = frm_1)
SAL_ACTUEL ~ SAL_EMBAUCHE
id SEXE AGE CATEGORIE NB_ETUDES EXPERIENCE ANCIENNETE SAL_EMBAUCHE SAL_ACTUEL VILLE SATIS_EMPLOI SATIS_CHEF SATIS_SALAIRE SATIS_COLLEGUES SATIS_CE .fitted .resid .hat .sigma .cooksd .std.resid
1 F 64 1 8 275 70 10200 15750 PARIS FALSE TRUE FALSE FALSE TRUE 21404.59 -5654.59 0 8119.77 0 -0.70
2 F 55 1 8 43 74 10200 15900 PARIS FALSE TRUE FALSE FALSE TRUE 21404.59 -5504.59 0 8119.99 0 -0.68
3 F 56 1 8 0 92 9750 16200 LILLE TRUE FALSE FALSE TRUE TRUE 20545.34 -4345.34 0 8121.49 0 -0.54
4 F 60 1 12 0 82 10200 16200 BORDEAUX FALSE FALSE FALSE TRUE TRUE 21404.59 -5204.59 0 8120.41 0 -0.64
5 F 62 1 12 180 68 10200 16200 PARIS FALSE TRUE TRUE TRUE TRUE 21404.59 -5204.59 0 8120.41 0 -0.64
6 F 61 1 12 163 66 10200 16350 PARIS FALSE TRUE FALSE FALSE TRUE 21404.59 -5054.59 0 8120.61 0 -0.62
Code
# Keep only the diagnostic columns added by augment().
knitr::kable(
  head(select(lm_1_aug, starts_with("."))),
  digits = 2, caption = frm_1
)
SAL_ACTUEL ~ SAL_EMBAUCHE
.fitted .resid .hat .sigma .cooksd .std.resid
21404.59 -5654.59 0 8119.77 0 -0.70
21404.59 -5504.59 0 8119.99 0 -0.68
20545.34 -4345.34 0 8121.49 0 -0.54
21404.59 -5204.59 0 8120.41 0 -0.64
21404.59 -5204.59 0 8120.41 0 -0.64
21404.59 -5054.59 0 8120.61 0 -0.62

Let base R produce diagnostic plots

Code
# All six base-R diagnostic plots for lm_1: residuals vs fitted, QQ,
# scale-location, Cook's distance, residuals vs leverage, Cook vs leverage.
plot(lm_1, which = 1:6)

We will reproduce (and discuss) four of the six diagnostic plots provided by the plot method from base R (1,2,3,5).

Code
# Diagnostic plot 1: residuals versus fitted values, with a loess trend.
# Fixes: stray ")" removed from the y label; `se = FALSE` spelled out.
p_1_lm_1 <- lm_1_aug %>%
  ggplot() +
  aes(x = .fitted, y = .resid) +
  geom_point(alpha = .5, size = .5) +
  geom_smooth(formula = y ~ x,
              method = "loess",
              se = FALSE,
              linetype = "dotted",
              linewidth = .5,
              color = "black") +
  xlab("Fitted values") +
  ylab("Residuals") +
  ggtitle("Bank dataset",
          subtitle = frm_1) +
  labs(caption = "Residuals versus Fitted")
Code
# Reusable version of diagnostic plot 1 (residuals vs fitted) for any
# fitted lm. Fixes: deprecated `size` -> `linewidth` for line geoms
# (ggplot2 >= 3.4.0), stray ")" in the y label, `se = FALSE` spelled out.
make_p_diag_1 <- function(lm.){
  augment(lm.) %>%
    ggplot() +
    aes(x = .fitted, y = .resid) +
    geom_point(alpha = .5, size = .5) +
    geom_smooth(method = "loess",
                formula = y ~ x,
                se = FALSE,
                linetype = "dotted",
                linewidth = .5,
                color = "black") +
    xlab("Fitted values") +
    ylab("Residuals") +
    labs(title = "Residuals versus Fitted")
}
Code
# Diagnostic plot 3 (scale-location): sqrt(|standardized residuals|)
# versus fitted values. Fix: `size` -> `linewidth` for the smoother line
# (the rendered output shows the ggplot2 3.4.0 deprecation warning for
# this very chunk); `se = FALSE` spelled out.
p_3_lm_1 <- lm_1_aug %>%
  ggplot() +
  aes(x = .fitted, y = sqrt(abs(.std.resid))) +
  geom_smooth(formula = y ~ x,
              se = FALSE,
              method = "loess",
              linetype = "dotted",
              linewidth = .5,
              color = "black") +
  xlab("Fitted values") +
  ylab("sqrt(standardized residuals)") +
  geom_point(size = .5, alpha = .5) +
  ggtitle("Bank dataset",
          subtitle = frm_1) +
  labs(caption = "Scale location")
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
Code
# plot(lm.1, which=3)
Code
# Reusable scale-location plot for any fitted lm.
# Fix: deprecated `size` -> `linewidth` for the smoother line
# (ggplot2 >= 3.4.0); `se = FALSE` spelled out.
make_p_diag_3 <- function(lm.){
  augment(lm.) %>%
    ggplot() +
    aes(x = .fitted, y = sqrt(abs(.std.resid))) +
    geom_smooth(formula = y ~ x,
                method = "loess",
                se = FALSE,
                linetype = "dotted",
                linewidth = .5,
                color = "black") +
    xlab("Fitted values") +
    ylab("sqrt(standardized residuals)") +
    geom_point(size = .5, alpha = .5) +
    labs(title = "Scale location")
}
Code
# Diagnostic plot 5: standardized residuals against leverage.
p_5_lm_1 <- ggplot(lm_1_aug, aes(x = .hat, y = .std.resid)) +
  geom_point(size = .5, alpha = .5) +
  xlab("Leverage") +
  ylab("Standardized residuals") +
  ggtitle("Bank dataset", subtitle = frm_1)

# plot(lm.1, which = 5)
Code
# Reusable residuals-versus-leverage plot for any fitted lm.
# Fix: "residulas" typo corrected in the displayed title; redundant
# double parentheses around .std.resid removed.
make_p_diag_5 <- function(lm.){
  augment(lm.) %>%
    ggplot() +
    aes(x = .hat, y = .std.resid) +
    geom_point(size = .5, alpha = .5) +
    xlab("Leverage") +
    ylab("Standardized residuals") +
    labs(title = "Standardized residuals versus Leverages")
}

In the second diagnostic plot (the residuals qqplot), we build a quantile-quantile plot by plotting the function \(F_n^{\leftarrow} \circ \Phi\), where \(\Phi\) is the CDF of the standard Gaussian distribution and \(F_n^{\leftarrow}\) is the empirical quantile function of the residuals.

Code
# Diagnostic plot 2: normal QQ-plot of the residuals.
# Fix: `size` -> `linewidth` for the reference line (ggplot2 >= 3.4.0).
p_2_lm_1 <- lm_1_aug %>%
  ggplot() +
  aes(sample = .resid) +
  geom_qq(size = .5, alpha = .5) +
  stat_qq_line(linetype = "dotted",
               linewidth = .5,
               color = "black") +
  ggtitle("Bank dataset",
          subtitle = frm_1) +
  labs(caption = "Residuals qqplot") +
  xlab("Theoretical quantiles") +
  ylab("Empirical quantiles of residuals")

# plot(lm_1, which = 2)
Code
# Reusable residuals QQ-plot for any fitted lm.
# Fix: `size` -> `linewidth` for the reference line (ggplot2 >= 3.4.0).
make_p_diag_2 <- function(lm.){
  augment(lm.) %>%
    ggplot() +
    aes(sample = .resid) +
    geom_qq(size = .5, alpha = .5) +
    stat_qq_line(linetype = "dotted",
                 linewidth = .5,
                 color = "black") +
    labs(title = "Residuals qqplot")
}
Code
# 2x2 grid of the four diagnostic plots. Fix: the layout object must be
# added to the patchwork; in the original `lyt` was created but never
# applied (a dead assignment).
lyt <- patchwork::plot_layout(ncol = 2, nrow = 2)

make_p_diag_1(lm_1) +
  make_p_diag_2(lm_1) +
  make_p_diag_3(lm_1) +
  make_p_diag_5(lm_1) +
  lyt    # DRY this ?

SAL_ACTUEL ~ SAL_EMBAUCHE
Code
# Assemble the four pre-built diagnostic plots with patchwork.
diag_panel <- p_1_lm_1 + p_2_lm_1 + p_3_lm_1 + p_5_lm_1
diag_panel
Code
# Observed response versus fitted values: for a good fit the points
# should hug the identity line. Fixes: `size` -> `linewidth` for the
# smoother (ggplot2 >= 3.4.0); `se = FALSE` spelled out.
p_1_bis_lm_1 <- lm_1_aug %>%
  ggplot() +
  aes(x = .fitted, y = SAL_ACTUEL) +
  geom_point(alpha = .5, size = .5) +
  geom_smooth(formula = y ~ x,
              method = "loess",
              se = FALSE,
              linetype = "dotted",
              linewidth = .5,
              color = "black") +
  xlab("Fitted values") +
  ylab("SAL_ACTUEL") +
  ggtitle("Bank dataset",
          subtitle = frm_1) +
  labs(caption = "SAL_ACTUEL versus Fitted")

p_1_bis_lm_1

Play it again with AGE and SAL_ACTUEL

Code
# Regress current salary on age and show the coefficient table.
lm_2 <- lm(SAL_ACTUEL ~ AGE, data = bank)

tidy(lm_2)
# A tibble: 2 × 5
  term        estimate std.error statistic  p.value
  <chr>          <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)   42272.    2571.      16.4  2.30e-48
2 AGE            -211.      65.8     -3.20 1.45e- 3
Code
# Same 2x2 diagnostic grid for the AGE model. Fix: the layout object is
# now actually added to the patchwork instead of being a dead assignment.
lyt <- patchwork::plot_layout(ncol = 2, nrow = 2)

make_p_diag_1(lm_2) +
  make_p_diag_2(lm_2) +
  make_p_diag_3(lm_2) +
  make_p_diag_5(lm_2) +
  lyt

SAL_ACTUEL ~ AGE
Code
# SAL_ACTUEL against AGE with the OLS line and the (weak, negative)
# correlation annotated. Fixes: stray trailing comma in geom_point();
# `se = FALSE` spelled out.
bank %>%
  ggplot() +
  aes(x = AGE, y = SAL_ACTUEL) +
  geom_point(alpha = .5, size = .5) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  annotate(geom = "text", x = 60, y = 60000,
           label = str_c("rho = ",
                         round(cor(bank$SAL_ACTUEL, bank$AGE), 2))) +
  ggtitle("Bank dataset")

Inspect rows with high Cook’s distance

Code
# Rows whose Cook's distance exceeds twice the average distance,
# shown without the diagnostic columns.
lm_1_aug %>%
  filter(.cooksd > 2 * mean(.cooksd)) %>%
  select(!starts_with(".")) %>%
  DT::datatable()
Code
# Lower-triangular correlation plot of the numeric columns.
num_cols <- bank %>%
  select(-id) %>%
  select(where(is.numeric))

num_cols %>%
  corrr::correlate() %>%
  corrr::shave() %>%
  corrr::rplot()
Correlation computed with
• Method: 'pearson'
• Missing treated using: 'pairwise.complete.obs'

Predictive linear regression of SAL_ACTUEL as a function of age AGE

To perform linear fitting, we choose \(450\) points amongst the \(474\) sample points: the \(24\) remaining points are used to assess the merits of the linear fit.

Code
# Fix the RNG seed for a reproducible split. NOTE(review): set.seed()
# returns NULL invisibly, so `old_seed` is always NULL -- it does NOT
# capture the previous RNG state (that would require saving .Random.seed).
old_seed <- set.seed(42)

# Size of the training set; the remaining rows form the test set.
trainset_size <-  450

# Sample training ids from the character "id" column; the test set is
# the complement.
trainset <- sample(pluck(bank, "id") , trainset_size)

testset <- setdiff(pluck(bank, "id") , trainset)

# Convert the ids back to integer row indices for subsetting.
trainset <- as.integer(trainset)
testset <- as.integer(testset)

# foo <- slice_sample(bank, n = trainset_size)